# Web Scraping Job Boards: Indeed
This tool has evolved from Jean-Nicholas Hould's Web Scraping for Beers web page.  This example programme uses the Indeed Job Board to search for advertised roles.  The code builds each search URL as a text string from the search term, the posting age and the page number.  It then cycles through each results page, locating the HTML tags which define distinct jobs, and stores the relevant details of each job.

In [1]:
# Import relevant Python packages
# Python 3.6 :: Anaconda 4.3.1 (x86_64)
# pandas==0.19.2
# beautifulsoup4==4.5.3
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [2]:
# Determines whether the URL (httpstring) is a valid web page link.
# The function attempts to open the URL and returns True if successful or False if unsuccessful
def is_link(httpstring):
    """Return True if the URL *httpstring* can be opened, otherwise False.

    The URL is opened with ``urlopen`` purely as a reachability check; the
    response is closed again straight away and its content is not read.

    Args:
        httpstring: The URL to test, as a string.

    Returns:
        True when ``urlopen`` succeeds, False when it raises ``IOError``
        (which includes ``URLError`` and ``HTTPError``) or ``ValueError``
        (raised for a string that is not a well-formed URL).
    """
    try:
        html = urlopen(httpstring)
    except (IOError, ValueError):
        return False
    # Actually call close(): a bare 'html.close' only looks the method up and
    # leaves the connection open.
    html.close()
    return True

In [4]:
def _extract_job(div_tag):
    """Build one job record from a 'div' tag, or return None if it holds no job.

    A 'div' is treated as a job when it contains an <h2> tag which in turn
    contains an <a> tag.  Fields which cannot be found are left as "".
    """
    heading = div_tag.h2
    # Skip 'div' tags without a heading, and headings without a link, rather
    # than failing with AttributeError part-way through the scrape
    if heading is None or heading.a is None:
        return None
    anchor = heading.a

    # Job title is text in <h2> <a> tag
    job_title = anchor.text
    # Web link is href in <h2> <a> tag (left blank if the tag has no href)
    href = anchor.get('href')
    weblink = 'https://www.indeed.co.uk' + href if href else ""

    # Job location is text in 'span' tag which has 'class'='location'
    joblocation = ""
    location_tag = div_tag.find("span", {'class': 'location'})
    if location_tag is not None:
        joblocation = location_tag.text

    # Employer is text in <a> tag of 'span' tag which has 'class'='company'
    # if that <a> tag exists, otherwise the text of the 'span' tag itself
    employer = ""
    employer_tag = div_tag.find("span", {'class': 'company'})
    if employer_tag is not None:
        employer_source = employer_tag.a if employer_tag.a is not None else employer_tag
        employer = employer_source.get_text().replace('\n', '')

    # Salary is text in 'span' tag which has 'class'='no-wrap'
    jobsalary = ""
    salary_tag = div_tag.find("span", {'class': 'no-wrap'})
    if salary_tag is not None:
        jobsalary = salary_tag.text.replace('\n', '')

    # 'jobtype', 'posteddate' and 'jobdetail' are not scraped and stay blank
    return {
        "websource" : "Indeed",
        "jobtitle" : job_title,
        "Joblocation" : joblocation,
        "jobsalary" : jobsalary,
        "employer" : employer,
        "weblink" : weblink,
        "jobtype" : "",
        "posteddate" : "",
        "jobdetail" : "",
    }


def get_all_jobs(entertopic=None, numpages=None):
    """Scrape Indeed search result pages and return the jobs found.

    Args:
        entertopic: List of search terms, with '+' in place of spaces.
            Defaults to the four data science terms originally hard-coded.
        numpages: Number of result pages to cycle through for each search
            term, in the same order as 'entertopic'.  It is advisable to try
            each search term on the website and confirm the relevant number
            of pages to use.  Defaults to [8, 25, 9, 17].

    Returns:
        A list of dictionaries, one per job found, with the keys 'websource',
        'jobtitle', 'Joblocation', 'jobsalary', 'employer', 'weblink',
        'jobtype', 'posteddate' and 'jobdetail'.

    Raises:
        ValueError: If 'entertopic' and 'numpages' have different lengths.
    """
    if entertopic is None:
        entertopic = ["Data+Scientist","Machine+Learning","Artificial+Intelligence","Data+Science"]
    if numpages is None:
        numpages = [8,25,9,17]
    if len(entertopic) != len(numpages):
        raise ValueError("entertopic and numpages must have the same length")

    # Initialise the list 'jobs' which collects one dictionary per job
    jobs = []
    # Loop through each search term together with its number of pages
    for topic, pages in zip(entertopic, numpages):
        # Loop through each URL page (i represents the page index)
        for i in range(pages):
            # Create the URL using the Indeed stem, search term, job age, number of jobs per webpage and page number
            httpstring = "https://www.indeed.co.uk/jobs?&as_phr=" + topic + "&fromage=15&limit=50&sort=date&start=" + str(i*50)
            print(i,httpstring)

            # Include a delay between loading webpages.  This is good etiquette to avoid impacting website performance
            time.sleep(4)
            # Open the page once only (a separate validity check would request
            # every page twice) and close the connection as soon as it is parsed
            try:
                with urlopen(httpstring) as html:
                    html_soup = BeautifulSoup(html, 'html.parser')
            except IOError:
                # The URL could not be opened or read: skip this page
                print("Skipping page, could not open:", httpstring)
                continue

            # Using Beautiful Soup select all HTML 'div' tags on the webpage.
            # NOTE(review): this includes 'div' tags nested inside other 'div'
            # tags, so a wrapper 'div' around a job presumably yields a repeat
            # of that job - confirm against the page markup and de-duplicate
            # on 'weblink' if needed.
            for div_tag in html_soup.find_all("div"):
                job = _extract_job(div_tag)
                if job is not None:
                    jobs.append(job)

    return jobs

In [5]:
# Run the scraper: the result is a list holding one dictionary per job
job_list = get_all_jobs()
# Build the pandas DataFrame 'jl' from the scraped job records
jl = pd.DataFrame(data=job_list)
# Leave 'jl' as the final expression so that the notebook displays it
jl

0 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=0
1 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=50
2 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=100
3 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=150
4 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=200
5 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=250
6 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=300
7 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=350
8 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=400
9 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50&sort=date&start=450
10 https://www.indeed.co.uk/jobs?&as_phr=Data+Science&fromage=15&limit=50

Unnamed: 0,Joblocation,employer,jobdetail,jobsalary,jobtitle,jobtype,posteddate,weblink,websource
0,London,GreenTomatoCars,,,Data Scientist Intern,,,https://www.indeed.co.uk/rc/clk?jk=8a0fa396dca...,Indeed
1,London,Adhunter,,"£45,000 - £55,000 a year",Product Manager (Contract),,,https://www.indeed.co.uk/rc/clk?jk=e31d1bc7bea...,Indeed
2,Reading,IQVIA,,,RFI Developer,,,https://www.indeed.co.uk/rc/clk?jk=02fd2503cfa...,Indeed
3,London,Hitachi Consulting UK Limited,,,Data Scientist (Pre-Sales),,,https://www.indeed.co.uk/rc/clk?jk=fe94e6985e9...,Indeed
4,Belfast,ShopKeep,,,Senior Data Analyst,,,https://www.indeed.co.uk/rc/clk?jk=4a6919872db...,Indeed
5,England,University of London,,"£40,000 - £60,000 a year",Machine Learning Researcher (Reinforcement Lea...,,,https://www.indeed.co.uk/rc/clk?jk=0ea91fbf2cc...,Indeed
6,London,Hitachi Consulting UK Limited,,,"Project Manager - Analytics, Insights & Digital",,,https://www.indeed.co.uk/rc/clk?jk=e2d4d8134a8...,Indeed
7,London,The Economist Group,,,Analytics Implementation Specialist,,,https://www.indeed.co.uk/rc/clk?jk=90806eb5acd...,Indeed
8,London,PeoplePerHour.com,,,BI Analyst,,,https://www.indeed.co.uk/rc/clk?jk=342498ff56a...,Indeed
9,London,Hitachi Consulting UK Limited,,,Business Development Executive - Analytics,,,https://www.indeed.co.uk/rc/clk?jk=c38c9628a45...,Indeed


In [6]:
# Select the fields of 'jl' in the order wanted for the export
jlnew = jl[[
    'websource',
    'employer',
    'jobtitle',
    'Joblocation',
    'jobsalary',
    'jobtype',
    'posteddate',
    'jobdetail',
    'weblink',
]]
# Write the reordered dataframe 'jlnew' out as a CSV file
jlnew.to_csv('Jobs_Indeed_20180421.csv')
# Show the first field of the first row as a quick check
jlnew.iloc[0, 0]

'Indeed'